# stateYearControls.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# This file loads and codes state-year control variables from various sources.
# They are stored in the stateYearVars data frame.


library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))  # for qw()
library(dplyr)    # for rename()
library(RCurl)
library(zoo)      # for na.approx()
# Session-wide defaults for RCurl requests (the getURL() call below is the
# only RCurl request visible in this file).
# NOTE(review): ssl.verifypeer = FALSE switches off verification of the
# server's TLS certificate, so the download below is not protected against
# tampering in transit, and the capath setting presumably has no effect while
# verification is off.  capath is also given a file (cacert.pem); libcurl
# appears to expect a directory for capath and a bundle file for cainfo --
# TODO confirm, and consider cainfo = <that file> with ssl.verifypeer = TRUE.
options(
  RCurlOptions = list(
    capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"), 
    ssl.verifypeer = FALSE))


# IMPORT STATE-YEAR DATA
# The data are fetched as the CSV export of a published Google spreadsheet:
# the whole export is downloaded into one string, which is then parsed 
# through a text connection.
CSV.URL <- 'https://docs.google.com/spreadsheets/d/1FDxKiJLGGaLuLj3sERQo-zdjvHvFWjEJ-gMXj9fPW7o/pub?single=true&gid=0&output=csv'
CSV.stateYearVars      <- getURL(CSV.URL)
CSV.stateYearVars.cnxn <- textConnection(CSV.stateYearVars)

# Column specification, in the order in which the columns are read.  Each
# name is the column name to assign; each value is the class to read it as.
stateYearColumnSpec <- c(
  fullstate                        = 'character',
  state                            = 'factor',
  year                             = 'integer',
  VAP.turnout                      = 'numeric',
  pop                              = 'integer',
  popSource                        = 'character',
  incomePerCapita                  = 'integer',
  incomeSource                     = 'character',
  percentBlack                     = 'numeric',
  percentBlackSource               = 'character',
  percentForeignBorn               = 'numeric',
  percentForeignBornSource         = 'character',
  percentUrban                     = 'numeric',
  percentUrbanSource               = 'character',
  percentWorkInManufacturing       = 'numeric',
  percentWorkInManufacturingSource = 'character',
  doctorsPerCapita                 = 'numeric',
  doctorsPerCapitaSource           = 'character',
  higherEdEnrollment               = 'numeric',
  higherEdEnrollmentSource         = 'character',
  teacherSalaries                  = 'numeric',
  teacherSalariesSource            = 'character',
  numberOfTeachers                 = 'numeric',
  numberOfTeachersSource           = 'character',
  enrollment                       = 'numeric',
  enrollmentSource                 = 'character',
  instructionalStaffSalaries       = 'numeric',
  instructionalStaffSalariesSource = 'character',
  PresidentParty                   = 'character')

# The names given here replace whatever header the CSV carries.  Cells
# holding either 'NA' or '(NA)' are read as missing.
stateYearVars <- read.csv(
  file       = CSV.stateYearVars.cnxn,
  col.names  = names(stateYearColumnSpec),
  colClasses = unname(stateYearColumnSpec),
  na.strings = c('NA', '(NA)'))
close(CSV.stateYearVars.cnxn)
rm(stateYearColumnSpec)  # helper only; keep it out of the workspace


# IMPORT CPI DATA (NEEDED TO ADJUST TEACHER SALARIES FOR INFLATION)
# The CPI table ships with the replication archive.  The same series can be 
# retrieved, in a different layout, directly from 
# https://fred.stlouisfed.org/series/CPIAUCNS.  [2019 07 17]
CPI <- read.table(
  'data/cpiai.txt',
  header = TRUE,
  sep    = '',     # fields are separated by white space
  skip   = 16,     # skip the first 16 lines; the header row comes next
  fill   = TRUE)   # pad rows that have fewer fields than the others

# Keep the year and the average CPI for the calendar year; name the latter
# 'CPI'.
CPI <- setNames(CPI[, c('Year', 'Avg.')], c('Year', 'CPI'))

# Merge CPI into stateYearVars
# A rownum column recording the current row order is attached first: with
# sort = FALSE, merge() leaves the rows in an unspecified order, and rownum
# is what the re-ordering step further down sorts on to restore it.
# seq_len() is used instead of 1:nrow() so that a zero-row data frame gets
# a zero-length rownum rather than c(1, 0).
# all.x = TRUE keeps every state-year row; CPI is NA for any year that has
# no entry in the CPI table.
stateYearVars <- merge(
  x     = data.frame(stateYearVars, rownum = seq_len(nrow(stateYearVars))),
  y     = CPI,
  by.x  = 'year',
  by.y  = 'Year',
  all.x = TRUE,
  all.y = FALSE,
  sort  = FALSE)
  
  
# IMPORT CHAMBER MEDIAN (DW-NOMINATE SCORE FOR MEDIAN HOUSE MEMBER)
# Read the fixed-width table of House medians, skipping the lines that come
# before the 56th Congress (1899-1900), and drop any row with a missing field.
tmp <- read.fwf(
  'data/hmedians_1-112.txt',
  widths    = c(4, 4, 7),
  col.names = c('CongNum', 'LegislatorsNum', 'HouseMedian'),
  skip      = 110)
tmp <- tmp[complete.cases(tmp), ]

# Expand to one row per calendar year: every remaining median is repeated for
# two consecutive years, counting up from 1899.  Inside with(), HouseMedian 
# refers to the column of tmp.
HouseMedian <- with(tmp, data.frame(
  USHouseMedian = rep(HouseMedian, each = 2),
  year          = 1899:(1899 + 2 * length(HouseMedian) - 1)))

# Left join on year: all state-year rows are kept, and USHouseMedian is NA
# for years that the table does not cover.
stateYearVars <- merge(
  stateYearVars,
  HouseMedian,
  by    = 'year',
  all.x = TRUE,
  sort  = FALSE)


# RE-ORDER THE MERGED DATASET
# The merge() calls above do not return rows in a guaranteed order.  Sorting 
# on rownum, which was recorded before the first merge, puts the rows back in
# the order of the original stateYearVars dataset.
stateYearVars <- with(stateYearVars, stateYearVars[order(rownum), ])


# CREATE DUMMY VARIABLE FROM PRESIDENTPARTY
# Give the column its final name first, then recode it in place: TRUE where 
# the party is 'Republican', FALSE for any other party, NA where it is missing.
stateYearVars <- rename(stateYearVars, RepublicanPresident = PresidentParty)
stateYearVars[['RepublicanPresident']] <- 
  stateYearVars[['RepublicanPresident']] == 'Republican'


# INTERPOLATE MISSING VALUES FOR TURNOUT AND FOR DECENNIAL-ONLY DATA
# Some characteristics, like percentBlack, are available only in Census 
# years (1900, 1910, etc.).  Values for the non-Census years are imputed by
# linear interpolation, done separately for each state on rows sorted by 
# year.  na.rm = FALSE is passed to na.approx() so that NAs which cannot be
# interpolated are kept and every column keeps its length.
# NOTE(review): na.approx() is called without an explicit x, so it presumably
# interpolates over row position within a state rather than over year.  The 
# two agree only if every state has one row per consecutive year -- confirm.
# The data remain grouped by state after this step.
stateYearVars <- arrange(stateYearVars, state, year)
stateYearVars <- group_by(stateYearVars, state)
stateYearVars <- mutate_at(
  stateYearVars,
  .vars = qw("VAP.turnout doctorsPerCapita higherEdEnrollment percentBlack percentForeignBorn percentUrban percentWorkInManufacturing"),
  .funs = na.approx,
  na.rm = FALSE)

# Per-capita higher-education enrollment, from the interpolated enrollment.
stateYearVars <- mutate(
  stateYearVars,
  higherEdEnrollmentPerCapImputed = higherEdEnrollment / pop)


# INTERPOLATE MISSING VALUES FOR EDUCATION CONTROL VARIABLES
# blendedTeacherSalaries is built in three steps:
#   1. take teacherSalaries, falling back on instructionalStaffSalaries where
#      teacherSalaries is missing;
#   2. adjust for inflation (salary * 100 / CPI), using the CPI column that
#      was merged into stateYearVars -- inside mutate(), CPI is that column, 
#      not the CPI data frame;
#   3. fill the remaining gaps by linear interpolation.
# The interpolation has to run within each state, or values would be 
# interpolated across the boundary between one state's rows and the next's. 
# group_by(state) is therefore stated explicitly here instead of relying on
# grouping left over from an earlier pipeline; it changes nothing if the data
# are already grouped by state.  The rows are still expected to be sorted by
# year within state.
stateYearVars <- stateYearVars %>%
  group_by(state) %>%
  mutate(
    blendedTeacherSalaries = coalesce(teacherSalaries, instructionalStaffSalaries),
    blendedTeacherSalaries = blendedTeacherSalaries * 100 / CPI,
    blendedTeacherSalaries = na.approx(blendedTeacherSalaries, na.rm = FALSE))

# Interpolate the number of teachers and the enrollment count, then take
# their ratio.  As above, the interpolation has to run within each state, so
# group_by(state) is stated explicitly instead of relying on grouping left 
# over from an earlier pipeline; it changes nothing if the data are already
# grouped by state.  The rows are expected to be sorted by year within state.
# Note that teacherStudentRatio is computed as enrollment divided by the
# number of teachers.
stateYearVars <- stateYearVars %>%
  group_by(state) %>%
  mutate_at(
    .vars = qw("numberOfTeachers enrollment"),
    .funs = na.approx,
    na.rm = FALSE) %>%
  mutate(teacherStudentRatio = enrollment / numberOfTeachers)
